library(tidyverse)
library(lubridate)
library(plotly)
library(skimr)
theme_set(theme_minimal())

I decided to join regression challenge because it is basically a monthly time-series!

Data Preparation

df_case <- read.csv("data/case_cost_prediction_train.csv")
df_case <- df_case %>% 
  mutate(tglpelayanan = as.Date(tglpelayanan))
df_case <- df_case %>% 
  arrange(kddati2, tkp, tglpelayanan)

# Validation data
df_case_val <- read.csv("data/case_cost_prediction_val.csv")
df_case_val <- df_case_val %>% 
  mutate(tglpelayanan = as.Date(tglpelayanan))
df_case_val <- df_case_val %>% 
  arrange(kddati2, tkp, tglpelayanan)

# Combine
df_case_all <- bind_rows(
  mutate(df_case, cat = "Train"),
  mutate(df_case_val, cat = "Test")
) %>% 
  arrange(kddati2, tkp, tglpelayanan)

# Small features engineering
df_case <- df_case %>% 
  mutate(m = month(tglpelayanan),
         y = year(tglpelayanan)) %>% 
  mutate(comb = paste0(a, b, c, cb, d, ds, gd, hd, i1, i2, i3, i4, kb, kc, kg, ki, kj, kk, kl, km, ko, kp, kt, ku, 
                       s, sa, sb, sc, sd))

df_monthly <- df_case %>% 
  group_by(tglpelayanan, tkp) %>% 
  summarise(total_case = sum(case),
            total_cost = sum(unit_cost * case),
            avg_cost = total_cost / total_case,
            kd = n_distinct(kddati2),
            case_per_kd = total_case / kd,
            cost_per_kd = total_cost / kd)


# KD level
df_kd <- df_case %>% 
  group_by(kddati2, tglpelayanan, tkp, comb) %>% 
  summarise(cnt = n(),
            peserta = sum(peserta),
            case = sum(case),
            case_ratio = case / peserta,
            unit_cost = sum(unit_cost),
            cost_per_case = unit_cost / case)

Data Exploration

Checking

df_case %>% 
  count(comb, sort = TRUE)

Validation

df_case_val %>% 
  count(kddati2)

Not existed in testing

df_case %>% 
  count(kddati2, tkp) %>% 
  rename(n_train = n) %>% 
  anti_join(
    df_case_val %>% 
      count(kddati2, tkp) %>% 
      rename(n_test = n) 
  )

General

How many dates?

p <- df_case %>%
  count(tglpelayanan) %>% 
  ggplot(aes(tglpelayanan, n)) +
  geom_line()
ggplotly(p)

How many cases per date?

p <- df_monthly %>% 
  ggplot(aes(tglpelayanan, total_case, color = factor(tkp))) +
  geom_line()
ggplotly(p)

Unit cost?

p <- df_monthly %>% 
  filter(tkp == 30, tglpelayanan < ymd(20210601)) %>% 
  ggplot(aes(tglpelayanan, avg_cost, color = factor(tkp))) +
  geom_line()
ggplotly(p)
p <- df_monthly %>% 
  filter(tkp == 40, tglpelayanan < ymd(20210601)) %>% 
  ggplot(aes(tglpelayanan, avg_cost, color = factor(tkp))) +
  geom_line()
ggplotly(p)

Train-Test Distribution

df_case %>% 
  filter(tkp == 30) %>% 
  select(row_id, tglpelayanan, kddati2) %>% 
  mutate(cat = "Train") %>% 
  bind_rows(
    df_case_val %>% 
      filter(tkp == 30) %>% 
      select(row_id, tglpelayanan, kddati2) %>% 
      mutate(cat = "Test")
  ) %>% 
  mutate(kddati2 = as.factor(kddati2)) %>%
  ggplot(aes(kddati2, tglpelayanan, fill = cat)) +
  coord_flip(expand = c(0,0)) +
  geom_tile() +
  theme_minimal()

Monthly Seasonality

df_case

Correlation (Peserta, Case)

df_case %>% 
  select(peserta, case, unit_cost) %>% 
  corrr::correlate()
df_case_wider <- df_case %>% 
  select(tglpelayanan, kddati2, tkp, peserta, case, unit_cost) %>% 
  pivot_wider(id_cols = c(tglpelayanan, kddati2), 
              names_from = tkp, values_from = c(peserta, case, unit_cost)) %>% 
  arrange(kddati2, tglpelayanan) %>% 
  mutate(peserta = coalesce(peserta_30, peserta_40)) %>% 
  mutate_at(vars(case_30, case_40, unit_cost_30, unit_cost_40, peserta),
            funs(lag1 = lag(., 1),
                 lag2 = lag(., 2),
                 lag3 = lag(., 3),
                 lead1 = lead(., 1),
                 lead2 = lead(., 2),
                 lead3 = lead(., 3)))

# 20,206
df_case_wider %>% 
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40) %>% 
  drop_na() %>% 
  corrr::correlate()
# 10,802
df_case_wider %>% 
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40, contains("lag1")) %>% 
  drop_na() %>% 
  corrr::correlate() %>% 
  corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>% 
  arrange(desc(r))
# 10,802
df_case_wider %>% 
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40, contains("lead1")) %>% 
  drop_na() %>% 
  corrr::correlate() %>% 
  corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>% 
  filter(x %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40") | 
           y %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40")) %>% 
  arrange(desc(r))
# 5,755
df_case_wider %>% 
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40, contains("lead1"), contains("lag1")) %>% 
  drop_na() %>% 
  corrr::correlate() %>% 
  corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>% 
  filter(x %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40") | 
           y %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40")) %>% 
  arrange(desc(r))

Visualize for Each KD

viz_kd <- function(id) {
  res <- df_case_all %>% 
    filter(kddati2 == id)
  res %>% 
    group_by(tglpelayanan, metrics = "peserta") %>% 
    summarise(val = mean(peserta, na.rm = TRUE)) %>% 
    ungroup() %>% 
    bind_rows(
      res %>% 
        filter(tkp == 30) %>% 
        transmute(tglpelayanan, metrics = "case_30", val = case, cat),
      res %>% 
        filter(tkp == 40) %>% 
        transmute(tglpelayanan, metrics = "case_40", val = case, cat)
    ) %>% 
    ggplot(aes(tglpelayanan, val, fill = metrics)) +
    geom_line() +
    facet_wrap(~metrics, scales = "free_y", nrow = 3) +
    labs(subtitle = id)
}
viz_kd(10)

Baseline

Median baseline is not good enough, let’s hack the submission

df_pred <- df_case %>% 
  group_by(kddati2) %>% 
  mutate(predict_case = median(case),
         predict_unit_cost = median(unit_cost)) %>% 
  ungroup() %>% 
  select(row_id, case, predict_case, unit_cost, predict_unit_cost)

set.seed(100)
case_random <- runif(nrow(df_case), 0.85, 1.15)
set.seed(1000)
unit_cost_random <- runif(nrow(df_case), 0.95, 1.07)
df_pred <- df_case %>% 
  mutate(case_random_ = case_random,
         unit_cost_random_ = unit_cost_random) %>% 
  mutate(predict_case = case * case_random_,
         predict_unit_cost = unit_cost * unit_cost_random) %>% 
  select(row_id, case, predict_case, unit_cost, predict_unit_cost)

mape <- function(y_actual, y_pred) {
  mean(abs( (y_actual - y_pred) / y_actual ))
}

mae <- function(y_actual, y_pred) {
  mean(abs( (y_actual - y_pred) ))
}

# MAPE
mape(df_pred$unit_cost, df_pred$predict_unit_cost)
## [1] 0.03080327
mape(df_pred$case, df_pred$predict_case)
## [1] 0.07498841
# MAE
mae(df_pred$unit_cost, df_pred$predict_unit_cost)
## [1] 60476.69
mae(df_pred$case, df_pred$predict_case)
## [1] 489.5009

Baseline average previous and after value

df_pred <- df_case %>% 
  group_by(kddati2, tkp) %>% 
  mutate(predict_case = (coalesce(lag(case), lead(case)) + coalesce(lead(case), lag(case))) / 2,
         predict_unit_cost = (coalesce(lag(unit_cost), lead(unit_cost)) + coalesce(lead(unit_cost), lag(unit_cost))) / 2) %>% 
  ungroup() %>% 
  select(row_id, case, predict_case, unit_cost, predict_unit_cost) %>% 
  filter(!is.na(predict_case), !is.na(predict_unit_cost))

# MAPE
mape(df_pred$unit_cost, df_pred$predict_unit_cost)
## [1] 0.04524511
mape(df_pred$case, df_pred$predict_case)
## [1] 1.90452
# MAE
mae(df_pred$unit_cost, df_pred$predict_unit_cost)
## [1] 77977.58
mae(df_pred$case, df_pred$predict_case)
## [1] 641.8952
df_pred <- df_pred %>% 
  mutate(pe = abs( (unit_cost - predict_unit_cost) / unit_cost ) )
# Format File Tidak Sesuai
write.csv(df_pred, "submission/tahap1_case_cost_prediction.csv", row.names = FALSE, quote = FALSE)

# Sukses :)
write.csv(df_pred %>% 
            select(row_id, predict_case, predict_unit_cost), 
          "submission/tahap1_case_cost_prediction.csv", row.names = FALSE, quote = FALSE)

Baseline (Check Val)

test_fe <- df_case_val %>% 
  select(row_id, tglpelayanan, kddati2, tkp, peserta) %>% 
  mutate(cat = "Test") %>% 
  bind_rows(
    df_case %>% 
    select(row_id, tglpelayanan, kddati2, tkp, peserta, case, unit_cost) %>% 
    mutate(cat = "Train")
  ) %>% 
  arrange(kddati2, tkp, tglpelayanan) %>% 
  group_by(kddati2, tkp) %>% 
  mutate_at(vars(case, peserta),
            funs(lag1 = lag(., 1),
                 lag2 = lag(., 2),
                 lag3 = lag(., 3),
                 lag6 = lag(., 6),
                 lead1 = lead(., 1),
                 lead2 = lead(., 2),
                 lead3 = lead(., 3),
                 lead6 = lead(., 6),)) %>% 
  ungroup()
test_fe %>% 
  group_by(cat) %>% 
  summarise_if(is.numeric, ~sum(ifelse(!is.na(.), 1, 0))) %>% 
  gather(key, val, -cat) %>% 
  group_by(cat) %>% 
  mutate(val_max = max(val)) %>% 
  filter(grepl("lag|lead", key)) %>% 
  mutate(val_pct = val / val_max)